import sys
sys.path.append("C:\\Users\\RDBanjacJe\\Desktop\\ELMToolBox")
from elmtoolbox.preprocessing import *
from elmtoolbox.trajectory import *
from elmtoolbox.postprocessing import *
from elmtoolbox.statistical_analysis import *
from elmtoolbox.reference_analysis import *
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from catboost import CatBoostRegressor
from sklearn.model_selection import GroupShuffleSplit
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.calibration import CalibratedClassifierCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score,cross_val_predict
from sklearn.model_selection import GroupKFold
from scipy.stats import pearsonr
import shap
import scipy as sp
import scipy.stats as stats
import seaborn as sns
from sklearn.metrics import r2_score
from sklearn.feature_selection import VarianceThreshold
from skbio.stats.composition import clr
import itertools
from matplotlib.lines import Line2D
import statsmodels.api as sm
import math
import scipy
from matplotlib.patches import Patch
from matplotlib.ticker import MaxNLocator
from catboost import Pool
import pathlib
pd.options.display.max_columns = None
pd.options.display.max_rows = None
pd.options.display.max_colwidth = None
n_splits = 5
test_size = 0.5
def fix_layout(width:int=95):
from IPython.core.display import display, HTML
display(HTML('<style>.container { width:' + str(width) + '% !important; }</style>'))
fix_layout()
df = pd.read_csv("../INPUT_FILES/mousedata.xls", sep="\t")
# add prefix to all bacteria columns
bacteria_names = df.columns[df.columns.str.startswith("g__")]
df = df.rename(mapper=dict([(b, f"bacteria_{b}") for b in bacteria_names]), axis=1)
# rename mouseID to subjectID
df = df.rename(mapper={"mouseID": "subjectID"}, axis=1)
# rename relativeTime to age_at_collection
df = df.rename(mapper={"relativeTime": "age_at_collection"}, axis=1)
# rename diet to group
df = df.rename(mapper={"diet": "group"}, axis=1)
# Songbird was complaining on type, so we put it explicitly
df.sampleID = df.sampleID.astype(str)
df.subjectID = df.subjectID.astype(str)
feature_columns = df.columns[df.columns.str.startswith("bacteria_")]
metadata_columns = df.columns[~df.columns.str.startswith("bacteria_")]
def fix_zeros(row):
for c in feature_columns:
row[c] = 1e-10 if row[c]==0.0 or row[c]<1e-10 else row[c]
return row
# put zeros and very small values to 1e-10 so ratios are better handled
df = df.apply(lambda row: fix_zeros(row), axis=1)
df.shape
(137, 41)
healthy_reference¶df["healthy_reference"] = False
df.loc[df["status"]==1, "healthy_reference"] = True
df["healthy_reference"].value_counts()
True 69 False 68 Name: healthy_reference, dtype: int64
dataset_type¶df["dataset_type"] = "Test"
for g in df.group.unique():
df_ref_g = df[(df.healthy_reference==True)&(df.group==g)]
train_idx_g, val_idx_g = next(GroupShuffleSplit(test_size=test_size, n_splits=2, random_state=7).split(df_ref_g, groups=df_ref_g["subjectID"]))
df.loc[df_ref_g.iloc[train_idx_g].index, "dataset_type"] = "Train"
df.loc[df_ref_g.iloc[val_idx_g].index, "dataset_type"] = "Validation"
print(f"---------- {g} ----------")
print(f"Train data size: {df[(df.dataset_type=='Train')&(df.group==g)].shape[0]:<4} samples | {len(df[(df.dataset_type=='Train')&(df.group==g)].subjectID.unique()):<4} infants")
print(f"Validation data size: {df[(df.dataset_type=='Validation')&(df.group==g)].shape[0]:<4} samples | {len(df[(df.dataset_type=='Validation')&(df.group==g)].subjectID.unique()):<4} infants")
print(f"Test data size: {df[(df.dataset_type=='Test')&(df.group==g)].shape[0]:<4} samples | {len(df[(df.dataset_type=='Test')&(df.group==g)].subjectID.unique()):<4} infants")
print(f"---------- ALL ----------")
print(f"Train data size: {df[(df.dataset_type=='Train')].shape[0]:<4} samples | {len(df[(df.dataset_type=='Train')].subjectID.unique()):<4} infants")
print(f"Validation data size: {df[(df.dataset_type=='Validation')].shape[0]:<4} samples | {len(df[(df.dataset_type=='Validation')].subjectID.unique()):<4} infants")
print(f"Test data size: {df[(df.dataset_type=='Test')].shape[0]:<4} samples | {len(df[(df.dataset_type=='Test')].subjectID.unique()):<4} infants")
---------- BK ---------- Train data size: 8 samples | 3 infants Validation data size: 7 samples | 3 infants Test data size: 68 samples | 6 infants ---------- Western ---------- Train data size: 27 samples | 3 infants Validation data size: 27 samples | 3 infants Test data size: 0 samples | 0 infants ---------- ALL ---------- Train data size: 35 samples | 3 infants Validation data size: 34 samples | 3 infants Test data size: 68 samples | 6 infants
classification_dataset_type and classification_label¶#df["dataset_type_classification"] = "Test"
train_idx1, test_idx1 = next(GroupShuffleSplit(test_size=test_size, n_splits=n_splits, random_state=7).split(df[df.healthy_reference==True].index, groups=df[df.healthy_reference==True]['subjectID']))
train_idx2, test_idx2 = next(GroupShuffleSplit(test_size=test_size, n_splits=n_splits, random_state=7).split(df[df.healthy_reference==False].index, groups=df[df.healthy_reference==False]['subjectID']))
df.loc[df[df.healthy_reference==True].iloc[train_idx1].index, "classification_dataset_type"] = "Train-1"
df.loc[df[df.healthy_reference==True].iloc[test_idx1].index, "classification_dataset_type"] = "Test-1"
df.loc[df[df.healthy_reference==False].iloc[train_idx2].index, "classification_dataset_type"] = "Train-2"
df.loc[df[df.healthy_reference==False].iloc[test_idx2].index, "classification_dataset_type"] = "Test-2"
df.loc[df.healthy_reference==True, "classification_label"] = 1
df.loc[df.healthy_reference==False, "classification_label"] = -1
fig = sampling_statistics(df, group="group", start_age=0, limit_age=80, time_unit_size=30, time_unit_name="months", file_name=None, height=400, width=1500)
#cut bacteria from prefix
short_bacteria_name = lambda x: x[9:]
fig1, fig2 = plot_bacteria_abundance_heatmaps(df, bacteria_names=feature_columns, short_bacteria_name=short_bacteria_name, time_unit_name="days", time_unit_size=1, avg_fn=np.median, fillna=False)
df.head()
| sampleID | bacteria_g__Akkermansia | bacteria_g__Alistipes | bacteria_g__Anaerofilum | bacteria_g__Anaerofustis | bacteria_g__Anaerostipes | bacteria_g__Anaerotruncus | bacteria_g__Anaerovorax | bacteria_g__Bacteroides | bacteria_g__Bilophila | bacteria_g__Bryantella | bacteria_g__Clostridium | bacteria_g__Collinsella | bacteria_g__Coprobacillus | bacteria_g__Dorea | bacteria_g__Eggerthella | bacteria_g__Enterobacter | bacteria_g__Enterococcus | bacteria_g__ErysipelotrichaceaeIncertaeSedis | bacteria_g__Eubacterium | bacteria_g__Faecalibacterium | bacteria_g__Hespellia | bacteria_g__Holdemania | bacteria_g__Hydrogenophaga | bacteria_g__LachnospiraceaeIncertaeSedis | bacteria_g__Lactococcus | bacteria_g__Mogibacterium | bacteria_g__nan | bacteria_g__Parabacteroides | bacteria_g__Prevotella | bacteria_g__Roseburia | bacteria_g__RuminococcaceaeIncertaeSedis | bacteria_g__Ruminococcus | bacteria_g__Subdoligranulum | bacteria_g__Sutterella | bacteria_g__Turicibacter | subjectID | date | group | age_at_collection | status | healthy_reference | dataset_type | classification_dataset_type | classification_label | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | PM1:20080108 | 5.198270e+00 | 8.861973 | 1.000000e-10 | 1.000000e-10 | 6.178487 | 6.178487e+00 | 5.198270e+00 | 13.537929 | 1.000000e-10 | 7.971544e+00 | 1.000000e-10 | 5.198270e+00 | 11.462721 | 6.178487 | 1.000000e-10 | 1.000000e-10 | 1.000000e-10 | 9.482376 | 7.750109e+00 | 11.886731 | 1.000000e-10 | 6.178487e+00 | 1.000000e-10 | 12.559548 | 1.000000e-10 | 1.000000e-10 | 14.360622 | 10.713877 | 14.109796 | 1.000000e-10 | 7.488414 | 7.750109e+00 | 1.000000e-10 | 1.000000e-10 | 1.000000e-10 | PM1 | 2008-01-08 | BK | 22 | 0 | False | Test | Test-2 | -1.0 |
| 1 | PM1:20080114 | 8.163470e+00 | 8.685500 | 1.000000e-10 | 1.000000e-10 | 5.198270 | 6.496426e+00 | 1.000000e-10 | 14.011826 | 6.178487e+00 | 8.916306e+00 | 1.000000e-10 | 4.237039e+00 | 9.887873 | 5.198270 | 4.237039e+00 | 1.000000e-10 | 1.000000e-10 | 9.068010 | 5.198270e+00 | 9.887873 | 1.000000e-10 | 1.000000e-10 | 1.000000e-10 | 11.525241 | 1.000000e-10 | 1.000000e-10 | 14.508325 | 11.041757 | 8.163470 | 1.000000e-10 | 6.756795 | 9.408481e+00 | 1.000000e-10 | 4.237039e+00 | 1.000000e-10 | PM1 | 2008-01-14 | BK | 28 | 0 | False | Test | Test-2 | -1.0 |
| 2 | PM1:20071211 | 8.536715e+00 | 6.230243 | 1.000000e-10 | 1.000000e-10 | 6.230243 | 1.000000e-10 | 1.000000e-10 | 12.998976 | 1.000000e-10 | 1.000000e-10 | 9.213329e+00 | 1.034129e+01 | 7.540594 | 7.802337 | 5.249333e+00 | 9.213329e+00 | 1.000000e-10 | 10.671213 | 6.230243e+00 | 7.220602 | 1.000000e-10 | 5.249333e+00 | 1.000000e-10 | 11.811211 | 1.000000e-10 | 1.000000e-10 | 14.468348 | 11.044470 | 13.395906 | 6.230243e+00 | 7.220602 | 1.000000e-10 | 6.230243e+00 | 5.249333e+00 | 5.249333e+00 | PM1 | 2007-12-11 | BK | 0 | 0 | False | Test | Test-2 | -1.0 |
| 3 | PM1:20080121 | 6.658211e+00 | 8.647458 | 1.000000e-10 | 1.000000e-10 | 6.658211 | 5.101538e+00 | 5.101538e+00 | 14.394351 | 6.080373e+00 | 8.968667e+00 | 1.000000e-10 | 1.000000e-10 | 11.103943 | 6.658211 | 1.000000e-10 | 1.000000e-10 | 1.000000e-10 | 9.231221 | 7.389452e+00 | 10.787631 | 5.101538e+00 | 5.101538e+00 | 1.000000e-10 | 12.136030 | 1.000000e-10 | 1.000000e-10 | 14.971828 | 10.942270 | 11.451726 | 1.000000e-10 | 7.872418 | 9.382984e+00 | 1.000000e-10 | 6.658211e+00 | 1.000000e-10 | PM1 | 2008-01-21 | BK | 35 | 0 | False | Test | Test-2 | -1.0 |
| 4 | PM1:20071217 | 1.000000e-10 | 9.300649 | 1.000000e-10 | 5.249333e+00 | 5.249333 | 5.249333e+00 | 1.000000e-10 | 13.803455 | 8.215758e+00 | 5.249333e+00 | 1.000000e-10 | 1.000000e-10 | 9.912834 | 11.255881 | 1.000000e-10 | 1.000000e-10 | 1.000000e-10 | 6.808785 | 1.000000e-10 | 13.415600 | 1.000000e-10 | 1.000000e-10 | 1.000000e-10 | 11.811211 | 1.000000e-10 | 1.000000e-10 | 14.718745 | 12.505766 | 15.998821 | 1.000000e-10 | 7.220602 | 8.023806e+00 | 1.000000e-10 | 5.249333e+00 | 1.000000e-10 | PM1 | 2007-12-17 | BK | 6 | 0 | False | Test | Test-2 | -1.0 |
df.to_csv("../INPUT_FILES/website_mousedata.xls", sep="\t", index=False)
_bacteria_names = list(map(lambda x: f"bacteria_{x}", bacteria_names))
nice_name = lambda x: x[9:].replace("_", " ")
if max(df.age_at_collection.values) < 100:
time_unit_name="days"
time_unit_size=1
else:
time_unit_name="months"
time_unit_size=30
num_cols = 3
total_num_rows = len(_bacteria_names)//num_cols+1
fig = dataset_bacteria_abundances(df, _bacteria_names, time_unit_size=time_unit_size, time_unit_name=time_unit_name, num_cols=num_cols, nice_name=nice_name, file_name=None, width=1200, height=200*total_num_rows, website=False)
plot_diversity(df, feature_columns, diversity="shannon", group="group", layout_height=1000, layout_width=1200);
a = reference_analysis(df, feature_columns, nice_name=lambda x: x[12:], show=True, website=False);
a = reference_analysis(df, feature_columns, nice_name=lambda x: x[12:], show=False, website=True);
a.show()
(70, 35) (70, 35)
C:\Anaconda3\envs\microbiome\lib\site-packages\plotly\matplotlylib\mplexporter\exporter.py:84: UserWarning: Blended transforms not yet supported. Zoom behavior may not work as expected. C:\Anaconda3\envs\microbiome\lib\site-packages\plotly\matplotlylib\renderer.py:410: UserWarning: Bummer! Plotly can currently only draw Line2D objects from matplotlib that are in 'data' coordinates! C:\Anaconda3\envs\microbiome\lib\site-packages\plotly\matplotlylib\renderer.py:474: UserWarning: Dang! That path collection is out of this world. I totally don't know what to do with it yet! Plotly can only import path collections linked to 'data' coordinates
type(fig)
NoneType